library(readxl)
library(dplyr)
library(stringr)
library(purrr)
library(tm)
library(SnowballC)
library(syuzhet)
library(viridis)
library(plotly)
library(readxl)
# read.csv() ships with base R, so the CSV below needs no extra package; readxl
# (install.packages("readxl")) is only needed when loading Excel workbooks with
# read_excel().
All_Tweets <- read.csv(
  file = "C:/Users/91992/OneDrive/Desktop/GGU/R projects/justdoit_tweets_colin.csv",
  header = TRUE
)
# Keep only the columns used downstream and normalise the tweet text.
All_Selected_Tweets <- All_Tweets %>%
  # user_location used to be listed twice; selecting it once is equivalent.
  select(
    tweet_created_at, tweet_favorite_count, tweet_full_text, tweet_id,
    tweet_in_reply_to_screen_name, tweet_in_reply_to_status_id,
    tweet_retweet_count, user_favourites_count, user_followers_count,
    user_id, user_location
  ) %>%
  mutate(
    # Strip typographic dashes/quotes, hyphens, ampersands and https links.
    tweet_full_text = str_remove_all(tweet_full_text, "–|’|—|“|”|-|&|https://[^\\s]+"),
    tweet_full_text = tolower(tweet_full_text),
    tweet_full_text = removePunctuation(tweet_full_text),
    tweet_full_text = stripWhitespace(tweet_full_text),
    # wordStem() treats every vector element as ONE word, so passing whole
    # tweets only mangled the tail of each tweet. Stem token by token instead.
    # NOTE(review): stemming before lexicon-based scoring may reduce lexicon
    # matches -- confirm that stemming is wanted at all.
    tweet_full_text = vapply(
      strsplit(tweet_full_text, " ", fixed = TRUE),
      function(tokens) {
        if (anyNA(tokens)) {
          return(NA_character_)
        }
        paste(wordStem(tokens), collapse = " ")
      },
      character(1)
    )
  )
# Score every cleaned tweet twice: once with get_sentiment()'s "syuzhet" method
# and once with its "nrc" method.
syuzhet_score <- get_sentiment(
  All_Selected_Tweets[["tweet_full_text"]],
  method = "syuzhet"
)
nrc_score <- get_sentiment(
  All_Selected_Tweets[["tweet_full_text"]],
  method = "nrc"
)
# Attach both raw score vectors to the tweets, then add their signs (-1, 0, 1)
# as the normalised Syuzhet and NRC columns.
Tweet_Threads_Analysis <- cbind(All_Selected_Tweets, syuzhet_score, nrc_score)
Tweet_Threads_Analysis$Syuzhet <- sign(Tweet_Threads_Analysis$syuzhet_score)
Tweet_Threads_Analysis$NRC <- sign(Tweet_Threads_Analysis$nrc_score)
# Column totals of the get_nrc_sentiment() result, drawn as a plotly bar chart
# with one viridis colour per bar.
nrc_sentiment <- get_nrc_sentiment(All_Selected_Tweets[["tweet_full_text"]])
sentisum <- colSums(nrc_sentiment)
interactive_bar <- layout(
  plot_ly(
    x = names(sentisum),
    y = sentisum,
    type = 'bar',
    marker = list(color = viridis::viridis(length(names(sentisum)), option = "D"))
  ),
  title = 'Emotions and Sentiments',
  xaxis = list(title = ''),
  yaxis = list(title = 'Count')
)
# Auto-print the widget to display it.
interactive_bar
# Find the tweets whose normalised Syuzhet sign is the lowest value present.
# `$syuzhet` used to partial-match the raw `syuzhet_score` column, so raw
# scores were being compared against the minimum of the -1/0/1 sign column.
# The `Syuzhet` column is now named explicitly on both sides.
min(Tweet_Threads_Analysis[["Syuzhet"]])
minScore <- which(
  Tweet_Threads_Analysis[["Syuzhet"]] == min(Tweet_Threads_Analysis[["Syuzhet"]])
)
minScore
# minScore holds row numbers, so index rows (note the comma); without it the
# data frame is indexed by column and fails with "undefined columns selected".
All_Selected_Tweets[minScore, ]
# Show at most 10 of the tweets selected by minScore. head() is used instead of
# [1:10] so that fewer than 10 matches are not padded with NA entries.
tweets_with_min_syuzhet <- head(All_Selected_Tweets$tweet_full_text[minScore], 10)
# To print the tweets
print(tweets_with_min_syuzhet)
# Find the tweets whose normalised Syuzhet sign is the highest value present.
# `$syuzhet` used to partial-match the raw `syuzhet_score` column, so raw
# scores were being compared against the maximum of the -1/0/1 sign column.
# The `Syuzhet` column is now named explicitly on both sides.
max(Tweet_Threads_Analysis[["Syuzhet"]])
maxScore <- which(
  Tweet_Threads_Analysis[["Syuzhet"]] == max(Tweet_Threads_Analysis[["Syuzhet"]])
)
maxScore
# maxScore holds row numbers, so index rows (note the comma); without it the
# data frame is indexed by column and fails with "undefined columns selected".
All_Selected_Tweets[maxScore, ]
# Show at most 10 of the tweets selected by maxScore. head() is used instead of
# [1:10] so that fewer than 10 matches are not padded with NA entries.
tweets_with_max_syuzhet <- head(All_Selected_Tweets$tweet_full_text[maxScore], 10)
# To print the tweets
print(tweets_with_max_syuzhet)
library(ggplot2)
# Histogram (5 bins) of the sign-normalised Syuzhet column.
p <- ggplot(data = Tweet_Threads_Analysis, mapping = aes(x = Syuzhet)) +
  geom_histogram(bins = 5, colour = "white", fill = "purple") +
  labs(
    x = "Sentiment Score",
    y = "Frequency",
    title = "Distribution of Sentiment Scores"
  )
library(plotly)
# Turn the ggplot histogram into a plotly widget, then auto-print it.
p_interactive <- p %>% ggplotly()
p_interactive
# Mean raw syuzhet score for each distinct user_location value.
state_sentiment <- summarise(
  group_by(Tweet_Threads_Analysis, user_location),
  average_sentiment = mean(syuzhet_score, na.rm = TRUE)
)
state_sentiment <- ungroup(state_sentiment)
# Five locations with the highest and the lowest mean sentiment.
# top_n() is superseded in dplyr; slice_max()/slice_min() are its replacements
# and match the calls used further down in this script.
top_5_states <- state_sentiment %>%
  slice_max(order_by = average_sentiment, n = 5)
bottom_5_states <- state_sentiment %>%
  slice_min(order_by = average_sentiment, n = 5)
# Five locations with the highest mean sentiment.
top_5_states <- state_sentiment %>%
  slice_max(order_by = average_sentiment, n = 5)
# print() has no heading argument: a string passed as its second argument is
# not displayed as a title, so write the heading with cat() first.
cat("Top 5 States by Average Sentiment\n")
print(top_5_states)
# Five locations with the lowest mean sentiment.
bottom_5_states <- state_sentiment %>%
  slice_min(order_by = average_sentiment, n = 5)
cat("Bottom 5 States by Average Sentiment\n")
print(bottom_5_states)
# Load the cleaned tweet export. The raw dataset needs cleaning, so read the
# file thoroughly and clean/impute data before relying on it.
Cleaned_Tweets <- read.csv(
  file = "C:/Users/91992/OneDrive/Desktop/GGU/R projects/Nike_Colin_Tweets_Cleaned.csv",
  header = TRUE
)
# Keep only the columns used downstream and normalise the tweet text.
All_Cleaned_Tweets <- Cleaned_Tweets %>%
  select(
    tweet_created_at, tweet_favorite_count, tweet_full_text, tweet_id,
    tweet_retweet_count, user_favourites_count, user_followers_count,
    user_id, user_location, user_location_us, Sentiment
  ) %>%
  mutate(
    # Strip typographic dashes/quotes, hyphens, ampersands and https links.
    tweet_full_text = str_remove_all(tweet_full_text, "–|’|—|“|”|-|&|https://[^\\s]+"),
    tweet_full_text = tolower(tweet_full_text),
    tweet_full_text = removePunctuation(tweet_full_text),
    tweet_full_text = stripWhitespace(tweet_full_text),
    # wordStem() treats every vector element as ONE word, so passing whole
    # tweets only mangled the tail of each tweet. Stem token by token instead.
    # NOTE(review): the word-frequency charts built from this text will show
    # stems rather than full words -- confirm that stemming is wanted.
    tweet_full_text = vapply(
      strsplit(tweet_full_text, " ", fixed = TRUE),
      function(tokens) {
        if (anyNA(tokens)) {
          return(NA_character_)
        }
        paste(wordStem(tokens), collapse = " ")
      },
      character(1)
    )
  )
# Mean of the Sentiment column per user_location_us value, highest mean first.
state_sentiment <- arrange(
  summarise(
    group_by(All_Cleaned_Tweets, user_location_us),
    average_sentiment = mean(Sentiment, na.rm = TRUE)
  ),
  desc(average_sentiment)
)
# Five states with the highest mean sentiment.
top_5_states <- state_sentiment %>%
  slice_max(order_by = average_sentiment, n = 5)
# print() has no heading argument: a string passed as its second argument is
# not displayed as a title, so write the heading with cat() first.
cat("Top 5 States by Average Sentiment\n")
print(top_5_states)
# Five states with the lowest mean sentiment.
bottom_5_states <- state_sentiment %>%
  slice_min(order_by = average_sentiment, n = 5)
cat("Bottom 5 States by Average Sentiment\n")
print(bottom_5_states)
# Load the Nike sales dataset (described as 2020-2021, North America) so sales
# can be set against the sentiment results.
Nike_sales_data <- read.csv(
  file = "C:/Users/91992/OneDrive/Desktop/GGU/R projects/Nike_Sales_US.csv",
  header = TRUE
)
# States whose sales are totalled in this step.
top_5_states_sales <- c("Montana", "North Dakota", "Wyoming", "Delaware", "Missouri")
# Sum Total.Sales per state, restricted to the states listed above; the result
# is returned ungrouped.
final_sales_top_selected_states <- Nike_sales_data %>%
  filter(is.element(State, top_5_states_sales)) %>%
  group_by(State) %>%
  summarize(TotalSalesCount = sum(Total.Sales, na.rm = TRUE), .groups = "drop")
# Show the totals.
print(final_sales_top_selected_states)
# States whose sales are totalled in this step.
bottom_5_states_sales <- c("Oklahoma", "West Virginia", "Alaska", "Nebraska", "Mississippi")
# Sum Total.Sales per state, restricted to the states listed above; the result
# is returned ungrouped.
final_sales_bottom_selected_states <- Nike_sales_data %>%
  filter(is.element(State, bottom_5_states_sales)) %>%
  group_by(State) %>%
  summarize(TotalSalesCount = sum(Total.Sales, na.rm = TRUE), .groups = "drop")
# Show the totals.
print(final_sales_bottom_selected_states)
# Reduce the Sentiment column to its sign (-1, 0 or 1).
All_Cleaned_Tweets$Normalized_Syuzhet <- sign(All_Cleaned_Tweets$Sentiment)
# Split the tweets by the sign of their sentiment. which() drops NA comparisons;
# plain logical indexing would turn every NA sentiment into an all-NA row that
# later gets counted as a state.
positive_tweets <- All_Cleaned_Tweets[which(All_Cleaned_Tweets$Normalized_Syuzhet > 0), ]
negative_tweets <- All_Cleaned_Tweets[which(All_Cleaned_Tweets$Normalized_Syuzhet < 0), ]
library(dplyr)
# Count positive tweets per user_location_us and keep the five largest counts.
top_5_positive_states <- slice_head(
  arrange(
    summarise(group_by(positive_tweets, user_location_us), PositiveTweetCount = n()),
    desc(PositiveTweetCount)
  ),
  n = 5
)
# Same ranking for the negative tweets.
top_5_negative_states <- slice_head(
  arrange(
    summarise(group_by(negative_tweets, user_location_us), NegativeTweetCount = n()),
    desc(NegativeTweetCount)
  ),
  n = 5
)
# Show both rankings, each under a plain-text heading. cat() writes the heading
# as-is, whereas print() on a string adds an index prefix and quotes.
cat("Top 5 States with Highest Positive Tweets:\n")
print(top_5_positive_states)
cat("Top 5 States with Highest Negative Tweets:\n")
print(top_5_negative_states)
# Pull the state names out of the user_location_us column of each ranking.
top_5_positive_states_list <- top_5_positive_states[["user_location_us"]]
top_5_negative_states_list <- top_5_negative_states[["user_location_us"]]
# Keep only the tweets that come from those states.
positive_tweets_top5 <- filter(
  positive_tweets,
  user_location_us %in% top_5_positive_states_list
)
negative_tweets_top5 <- filter(
  negative_tweets,
  user_location_us %in% top_5_negative_states_list
)
library(tidytext)
# Split each positive tweet into one word per row and count occurrences,
# most frequent first.
positive_word_freq <- count(
  unnest_tokens(positive_tweets_top5, word, tweet_full_text),
  word,
  sort = TRUE
)
# Same word counts for the negative tweets.
negative_word_freq <- count(
  unnest_tokens(negative_tweets_top5, word, tweet_full_text),
  word,
  sort = TRUE
)
# Load the stop_words dataset and drop every counted word that appears in it.
data(list = "stop_words")
positive_word_freq <- anti_join(positive_word_freq, stop_words, by = "word")
negative_word_freq <- anti_join(negative_word_freq, stop_words, by = "word")
# Number of leading rows to preview; change as needed.
N <- 10
positive_word_freq %>% head(N)
negative_word_freq %>% head(N)
library(dplyr)
# Keep the first N (10) rows of each frequency table for plotting.
N <- 10
top_positive_word_freq <- head(positive_word_freq, N)
top_negative_word_freq <- head(negative_word_freq, N)
library(ggplot2)
# Horizontal bar chart of the positive-tweet word counts, ordered by count,
# with the count printed next to each bar.
ggplot(
  data = top_positive_word_freq,
  mapping = aes(x = reorder(word, n), y = n)
) +
  geom_col(fill = "dodgerblue") +
  geom_text(
    mapping = aes(label = n),
    position = position_dodge(width = 0.9),
    vjust = -0.25,
    size = 3.5
  ) +
  coord_flip() +
  labs(
    x = "Word",
    y = "Frequency",
    title = "Top 10 Word Frequencies in Positive Tweets"
  ) +
  theme_minimal()

# Horizontal bar chart of the negative-tweet word counts, ordered by count,
# with the count printed next to each bar.
ggplot(
  data = top_negative_word_freq,
  mapping = aes(x = reorder(word, n), y = n)
) +
  geom_col(fill = "firebrick") +
  geom_text(
    mapping = aes(label = n),
    position = position_dodge(width = 0.9),
    vjust = -0.25,
    size = 3.5
  ) +
  coord_flip() +
  labs(
    x = "Word",
    y = "Frequency",
    title = "Top 10 Word Frequencies in Negative Tweets"
  ) +
  theme_minimal()

---
title: "Nike Colin Kaepernick Sentiment Analysis"
output:
  html_document: default
  html_notebook: default
  pdf_document: default
---

```{r}
library(readxl)
library(dplyr)
library(stringr)
library(purrr)
library(tm)
library(SnowballC)
library(syuzhet)
library(viridis)
library(plotly)
```

```{r}
library(readxl)

# Load readxl to read Excel files (CSV files are read with base R's read.csv()). If it is not installed, run install.packages("readxl") first.
```

```{r}
# Load the raw tweet export into a data frame. read.csv() handles CSV files;
# an Excel workbook would need read_excel() instead.
All_Tweets <- read.csv(
  file = "C:/Users/91992/OneDrive/Desktop/GGU/R projects/justdoit_tweets_colin.csv",
  header = TRUE
)
```

```{r}
# Keep only the columns used downstream and normalise the tweet text.
All_Selected_Tweets <- All_Tweets %>%
  # user_location used to be listed twice; selecting it once is equivalent.
  select(
    tweet_created_at, tweet_favorite_count, tweet_full_text, tweet_id,
    tweet_in_reply_to_screen_name, tweet_in_reply_to_status_id,
    tweet_retweet_count, user_favourites_count, user_followers_count,
    user_id, user_location
  ) %>%
  mutate(
    # Strip typographic dashes/quotes, hyphens, "&amp" and https links.
    tweet_full_text = str_remove_all(tweet_full_text, "–|’|—|“|”|-|&amp|https://[^\\s]+"),
    tweet_full_text = tolower(tweet_full_text),
    tweet_full_text = removePunctuation(tweet_full_text),
    tweet_full_text = stripWhitespace(tweet_full_text),
    # wordStem() treats every vector element as ONE word, so passing whole
    # tweets only mangled the tail of each tweet. Stem token by token instead.
    # NOTE(review): stemming before lexicon-based scoring may reduce lexicon
    # matches -- confirm that stemming is wanted at all.
    tweet_full_text = vapply(
      strsplit(tweet_full_text, " ", fixed = TRUE),
      function(tokens) {
        if (anyNA(tokens)) {
          return(NA_character_)
        }
        paste(wordStem(tokens), collapse = " ")
      },
      character(1)
    )
  )
```

```{r}
# Score every cleaned tweet twice: once with get_sentiment()'s "syuzhet" method
# and once with its "nrc" method.
syuzhet_score <- get_sentiment(
  All_Selected_Tweets[["tweet_full_text"]],
  method = "syuzhet"
)
nrc_score <- get_sentiment(
  All_Selected_Tweets[["tweet_full_text"]],
  method = "nrc"
)
```

```{r}
# Attach both raw score vectors to the tweets, then add their signs (-1, 0, 1)
# as the normalised Syuzhet and NRC columns.
Tweet_Threads_Analysis <- cbind(All_Selected_Tweets, syuzhet_score, nrc_score)
Tweet_Threads_Analysis$Syuzhet <- sign(Tweet_Threads_Analysis$syuzhet_score)
Tweet_Threads_Analysis$NRC <- sign(Tweet_Threads_Analysis$nrc_score)
```

```{r}
# Column totals of the get_nrc_sentiment() result, drawn as a plotly bar chart
# with one viridis colour per bar.
nrc_sentiment <- get_nrc_sentiment(All_Selected_Tweets[["tweet_full_text"]])
sentisum <- colSums(nrc_sentiment)
interactive_bar <- layout(
  plot_ly(
    x = names(sentisum),
    y = sentisum,
    type = 'bar',
    marker = list(color = viridis::viridis(length(names(sentisum)), option = "D"))
  ),
  title = 'Emotions and Sentiments',
  xaxis = list(title = ''),
  yaxis = list(title = 'Count')
)
```

```{r}
# Display the plot: evaluating the plotly object built in the previous chunk at
# top level auto-prints it into the chunk output.
interactive_bar
```

```{r}
# Find the tweets whose normalised Syuzhet sign is the lowest value present.
# `$syuzhet` used to partial-match the raw `syuzhet_score` column, so raw
# scores were being compared against the minimum of the -1/0/1 sign column.
# The `Syuzhet` column is now named explicitly on both sides.
min(Tweet_Threads_Analysis[["Syuzhet"]])
minScore <- which(
  Tweet_Threads_Analysis[["Syuzhet"]] == min(Tweet_Threads_Analysis[["Syuzhet"]])
)
minScore
# minScore holds row numbers, so index rows (note the comma). The object name
# is also corrected here: it was misspelled `All_Selected_Tweetss`.
All_Selected_Tweets[minScore, ]
```

```{r}
# Show at most 10 of the tweets selected by minScore. head() is used instead of
# [1:10] so that fewer than 10 matches are not padded with NA entries.
tweets_with_min_syuzhet <- head(All_Selected_Tweets$tweet_full_text[minScore], 10)

# To print the tweets
print(tweets_with_min_syuzhet)
```
```{r}
# Find the tweets whose normalised Syuzhet sign is the highest value present.
# `$syuzhet` used to partial-match the raw `syuzhet_score` column, so raw
# scores were being compared against the maximum of the -1/0/1 sign column.
# The `Syuzhet` column is now named explicitly on both sides.
max(Tweet_Threads_Analysis[["Syuzhet"]])
maxScore <- which(
  Tweet_Threads_Analysis[["Syuzhet"]] == max(Tweet_Threads_Analysis[["Syuzhet"]])
)
maxScore
# maxScore holds row numbers, so index rows (note the comma); without it the
# data frame is indexed by column and fails with "undefined columns selected".
All_Selected_Tweets[maxScore, ]
```

```{r}
# Show at most 10 of the tweets selected by maxScore. head() is used instead of
# [1:10] so that fewer than 10 matches are not padded with NA entries.
tweets_with_max_syuzhet <- head(All_Selected_Tweets$tweet_full_text[maxScore], 10)

# To print the tweets
print(tweets_with_max_syuzhet)
```
```{r}
library(ggplot2)

# Histogram (5 bins) of the sign-normalised Syuzhet column.
p <- ggplot(data = Tweet_Threads_Analysis, mapping = aes(x = Syuzhet)) +
  geom_histogram(bins = 5, colour = "white", fill = "purple") +
  labs(
    x = "Sentiment Score",
    y = "Frequency",
    title = "Distribution of Sentiment Scores"
  )
```


```{r}
library(plotly)

# Turn the ggplot histogram into a plotly widget.
p_interactive <- p %>% ggplotly()

# Auto-print the widget so it renders in the chunk output.
p_interactive
```


```{r}
# Mean raw syuzhet score for each distinct user_location value.
state_sentiment <- summarise(
  group_by(Tweet_Threads_Analysis, user_location),
  average_sentiment = mean(syuzhet_score, na.rm = TRUE)
)
state_sentiment <- ungroup(state_sentiment)
```

```{r}
# Five locations with the highest mean sentiment. top_n() is superseded in
# dplyr; slice_max() is its replacement and matches the later chunks.
top_5_states <- state_sentiment %>%
  slice_max(order_by = average_sentiment, n = 5)
```

```{r}
# Five locations with the lowest mean sentiment. top_n(-5, ...) is superseded
# in dplyr; slice_min() is its replacement and matches the later chunks.
bottom_5_states <- state_sentiment %>%
  slice_min(order_by = average_sentiment, n = 5)
```

```{r}
# Five locations with the highest mean sentiment.
top_5_states <- state_sentiment %>%
  slice_max(order_by = average_sentiment, n = 5)
# print() has no heading argument: a string passed as its second argument is
# not displayed as a title, so write the heading with cat() first.
cat("Top 5 States by Average Sentiment\n")
print(top_5_states)

# Five locations with the lowest mean sentiment.
bottom_5_states <- state_sentiment %>%
  slice_min(order_by = average_sentiment, n = 5)
cat("Bottom 5 States by Average Sentiment\n")
print(bottom_5_states)
```

```{r}
# Load the cleaned tweet export. The raw dataset needs cleaning, so read the
# file thoroughly and clean/impute data before relying on it.
Cleaned_Tweets <- read.csv(
  file = "C:/Users/91992/OneDrive/Desktop/GGU/R projects/Nike_Colin_Tweets_Cleaned.csv",
  header = TRUE
)
```

```{r}
# Keep only the columns used downstream and normalise the tweet text.
All_Cleaned_Tweets <- Cleaned_Tweets %>%
  select(
    tweet_created_at, tweet_favorite_count, tweet_full_text, tweet_id,
    tweet_retweet_count, user_favourites_count, user_followers_count,
    user_id, user_location, user_location_us, Sentiment
  ) %>%
  mutate(
    # Strip typographic dashes/quotes, hyphens, "&amp" and https links.
    tweet_full_text = str_remove_all(tweet_full_text, "–|’|—|“|”|-|&amp|https://[^\\s]+"),
    tweet_full_text = tolower(tweet_full_text),
    tweet_full_text = removePunctuation(tweet_full_text),
    tweet_full_text = stripWhitespace(tweet_full_text),
    # wordStem() treats every vector element as ONE word, so passing whole
    # tweets only mangled the tail of each tweet. Stem token by token instead.
    # NOTE(review): the word-frequency charts built from this text will show
    # stems rather than full words -- confirm that stemming is wanted.
    tweet_full_text = vapply(
      strsplit(tweet_full_text, " ", fixed = TRUE),
      function(tokens) {
        if (anyNA(tokens)) {
          return(NA_character_)
        }
        paste(wordStem(tokens), collapse = " ")
      },
      character(1)
    )
  )
```

```{r}
# Mean of the Sentiment column per user_location_us value, highest mean first.
state_sentiment <- arrange(
  summarise(
    group_by(All_Cleaned_Tweets, user_location_us),
    average_sentiment = mean(Sentiment, na.rm = TRUE)
  ),
  desc(average_sentiment)
)
```

```{r}
# Five states with the highest mean sentiment.
top_5_states <- state_sentiment %>%
  slice_max(order_by = average_sentiment, n = 5)
# print() has no heading argument: a string passed as its second argument is
# not displayed as a title, so write the heading with cat() first.
cat("Top 5 States by Average Sentiment\n")
print(top_5_states)

# Five states with the lowest mean sentiment.
bottom_5_states <- state_sentiment %>%
  slice_min(order_by = average_sentiment, n = 5)
cat("Bottom 5 States by Average Sentiment\n")
print(bottom_5_states)
```
```{r}
# Load the Nike sales dataset (described as 2020-2021, North America) so sales
# can be set against the sentiment results.
Nike_sales_data <- read.csv(
  file = "C:/Users/91992/OneDrive/Desktop/GGU/R projects/Nike_Sales_US.csv",
  header = TRUE
)
```

```{r}
# States whose sales are totalled in this chunk.
top_5_states_sales <- c("Montana", "North Dakota", "Wyoming", "Delaware", "Missouri")

# Sum Total.Sales per state, restricted to the states listed above; the result
# is returned ungrouped.
final_sales_top_selected_states <- Nike_sales_data %>%
  filter(is.element(State, top_5_states_sales)) %>%
  group_by(State) %>%
  summarize(TotalSalesCount = sum(Total.Sales, na.rm = TRUE), .groups = "drop")

# Show the totals.
print(final_sales_top_selected_states)
```

```{r}
# States whose sales are totalled in this chunk.
bottom_5_states_sales <- c("Oklahoma", "West Virginia", "Alaska", "Nebraska", "Mississippi")

# Sum Total.Sales per state, restricted to the states listed above; the result
# is returned ungrouped.
final_sales_bottom_selected_states <- Nike_sales_data %>%
  filter(is.element(State, bottom_5_states_sales)) %>%
  group_by(State) %>%
  summarize(TotalSalesCount = sum(Total.Sales, na.rm = TRUE), .groups = "drop")

# Show the totals.
print(final_sales_bottom_selected_states)
```
```{r}
# Reduce the Sentiment column to its sign (-1, 0 or 1).
All_Cleaned_Tweets$Normalized_Syuzhet <- sign(All_Cleaned_Tweets$Sentiment)
```


```{r}
# Split the tweets by the sign of their sentiment. which() drops NA comparisons;
# plain logical indexing would turn every NA sentiment into an all-NA row that
# later gets counted as a state.
positive_tweets <- All_Cleaned_Tweets[which(All_Cleaned_Tweets$Normalized_Syuzhet > 0), ]
negative_tweets <- All_Cleaned_Tweets[which(All_Cleaned_Tweets$Normalized_Syuzhet < 0), ]
```

```{r}
library(dplyr)

# Count positive tweets per user_location_us and keep the five largest counts.
top_5_positive_states <- slice_head(
  arrange(
    summarise(group_by(positive_tweets, user_location_us), PositiveTweetCount = n()),
    desc(PositiveTweetCount)
  ),
  n = 5
)

# Same ranking for the negative tweets.
top_5_negative_states <- slice_head(
  arrange(
    summarise(group_by(negative_tweets, user_location_us), NegativeTweetCount = n()),
    desc(NegativeTweetCount)
  ),
  n = 5
)
```

```{r}
# Show both rankings, each under a plain-text heading. cat() writes the heading
# as-is, whereas print() on a string adds an index prefix and quotes.
cat("Top 5 States with Highest Positive Tweets:\n")
print(top_5_positive_states)

cat("Top 5 States with Highest Negative Tweets:\n")
print(top_5_negative_states)
```
```{r}
# Pull the state names out of the user_location_us column of each ranking.
top_5_positive_states_list <- top_5_positive_states[["user_location_us"]]
top_5_negative_states_list <- top_5_negative_states[["user_location_us"]]

# Keep only the tweets that come from those states.
positive_tweets_top5 <- filter(
  positive_tweets,
  user_location_us %in% top_5_positive_states_list
)

negative_tweets_top5 <- filter(
  negative_tweets,
  user_location_us %in% top_5_negative_states_list
)
```

```{r}
library(tidytext)

# Split each positive tweet into one word per row and count occurrences,
# most frequent first.
positive_word_freq <- count(
  unnest_tokens(positive_tweets_top5, word, tweet_full_text),
  word,
  sort = TRUE
)

# Same word counts for the negative tweets.
negative_word_freq <- count(
  unnest_tokens(negative_tweets_top5, word, tweet_full_text),
  word,
  sort = TRUE
)
```

```{r}
# Load the stop_words dataset and drop every counted word that appears in it.
data(list = "stop_words")

positive_word_freq <- anti_join(positive_word_freq, stop_words, by = "word")

negative_word_freq <- anti_join(negative_word_freq, stop_words, by = "word")
```

```{r}
# Number of leading rows to preview; change as needed.
N <- 10

positive_word_freq %>% head(N)
negative_word_freq %>% head(N)
```

```{r}
library(dplyr)

# Keep the first N (10) rows of each frequency table for plotting.
N <- 10

top_positive_word_freq <- head(positive_word_freq, N)

top_negative_word_freq <- head(negative_word_freq, N)
```


```{r}
library(ggplot2)

# Horizontal bar chart of the positive-tweet word counts, ordered by count,
# with the count printed next to each bar.
ggplot(
  data = top_positive_word_freq,
  mapping = aes(x = reorder(word, n), y = n)
) +
  geom_col(fill = "dodgerblue") +
  geom_text(
    mapping = aes(label = n),
    position = position_dodge(width = 0.9),
    vjust = -0.25,
    size = 3.5
  ) +
  coord_flip() +
  labs(
    x = "Word",
    y = "Frequency",
    title = "Top 10 Word Frequencies in Positive Tweets"
  ) +
  theme_minimal()

# Horizontal bar chart of the negative-tweet word counts, ordered by count,
# with the count printed next to each bar.
ggplot(
  data = top_negative_word_freq,
  mapping = aes(x = reorder(word, n), y = n)
) +
  geom_col(fill = "firebrick") +
  geom_text(
    mapping = aes(label = n),
    position = position_dodge(width = 0.9),
    vjust = -0.25,
    size = 3.5
  ) +
  coord_flip() +
  labs(
    x = "Word",
    y = "Frequency",
    title = "Top 10 Word Frequencies in Negative Tweets"
  ) +
  theme_minimal()
```

